; Name:         MT19937-32
; Build:        see makefile
; Run:          ./MT19937-32
; Description:  Mersenne twister algorithm to generate 32 bits unsigned integers.
; Source:       http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
;               http://en.wikipedia.org/wiki/Mersenne_twister
; Remarks:      The routine GenerateNumbers can be called, the Initialize subroutine does this
;               before returning to the caller, in case the user forgets it.

%define SYS_EXIT            60
%define SYS_CLOCK_GETTIME   228
%define SYS_WRITE           1
%define STDOUT              1
%define CLOCK_REALTIME      0
%define NUMBER              10000

bits 64

STRUC TIME
  .sec:  resq    1
  .nsec: resq    1
ENDSTRUC

; use BUFFER name, type, number
%macro BUFFER 3
  %1:
    %if %2=="BYTE"
        %assign n 1
    %elif %2=="WORD"
        %assign n 2
    %elif %2=="DWORD"
        %assign n 4
    %elif %2=="QWORD"
        %assign n 8
    %endif
    .start: resb %3*n
    .end:
    .length: equ %1.end-%1.start
%endmacro

%macro STRING 2
  %1:
    .start: db %2
    .end:
    .length: equ %1.end-%1.start
%endmacro

%macro TIME 1
  %define %1.sec  %1+TIME.sec
  %define %1.nsec %1+TIME.nsec
  
  %1: ISTRUC TIME
    at  TIME.sec,  dq 0
    at  TIME.nsec, dq 0
  IEND
%endmacro

section .bss

  ; BE CAREFULL CHANGING THIS!! It's used by the Mersenne pseudo random generator subroutines.
  BUFFER index,"WORD",1
  BUFFER MT,"DWORD",624
  ; used in ConvertToDecimal  
  BUFFER decimalbuffer,"BYTE",20
  ; used in main program loop
  BUFFER testdata,"DWORD",NUMBER
  
section .data
  STRING sTitle, {"Implementation of the 32 bits Mersenne Twister.",10,"by Agguro - 2013",10}
  STRING sEOL, {10}
  STRING sSeparator, ","
  STRING sSpace, " "
  STRING sPeriod, "."
  STRING sSec, "s"
  STRING sExecTime, "Execution time to generate and store 10000 integers: "
  TIME   starttime
  TIME   endtime
      
section .text
        global _start
        
_start:

      mov   rsi, sTitle
      mov   rdx, sTitle.length
      call  Write
      ; initialize the Mersenne Twister
      xor   rdi, rdi                        ; autogenerate seed for us
      call  Initialize
      ; 1000 random 32 bit words will be generated to measure the time needed
      mov   rcx, NUMBER
      mov   rdi, testdata
      ; get starttime
      call  GetTime
      mov   QWORD[starttime.sec], rdx
      mov   QWORD[starttime.nsec], rax
.next:
      push  rdi
      call  ExtractNumber
      pop   rdi
      stosd
      loop  .next
      ; get endtime
      call  GetTime     
      mov   QWORD[endtime.sec], rdx
      mov   QWORD[endtime.nsec], rax
      ; send all integers to STDOUT
      mov   rsi, testdata
      mov   rcx, NUMBER
.nextNumber:
      xor   rax, rax                    ; clear upper 32 bits
      lodsd                             ; load integer in EAX
      push  rsi
      call  ConvertToDecimal
      push  rcx
      push  rsi
      push  rdx
      ; right align integers with alignment length 12 bytes
      mov   rcx, rdx
      sub   rcx, 12
      jz    .noalignment                ; if no difference in length no alignment
      neg   rcx
.space:
      mov   rsi, sSpace                 ; write space to align
      call  WriteChar
      loop  .space
.noalignment:      
      pop   rdx                         ; restore pointer to integer and length
      pop   rsi
      pop   rcx                         ; restore loopcounter
      call  Write
      mov   rax, rcx
      dec   rax
      mov   rbx, 10
      xor   rdx, rdx
      idiv  rbx
      cmp   rdx, 0                      ; if loopcounter mod 10 = 0 then tenth column
      jne   .sameline                   ; stay on same line
      mov   rsi, sEOL                   ; go to next line
      jmp   .printchar
.sameline:      
      mov   rsi, sSeparator
.printchar:      
      call  WriteChar
      pop   rsi
      loop  .nextNumber                 ; repeat for all numbers
      mov   rsi, sEOL                   ; end of table
      mov   rdx, 1
      call  Write
      mov   rsi, sExecTime
      mov   rdx, sExecTime.length
      call  Write
      ; calculate the time difference
      ; if end nanoseconds are less than start nanoseconds we need to adjust
      mov   rax, QWORD[endtime.nsec]
      sub   rax, QWORD[starttime.nsec]
      cmp   rax, 0
      jge   .calculate
      neg   rax
      sub   QWORD[endtime.sec], 1       ; 1 second less     
.calculate:
      push  rax                         ; store nanosec difference on stack
      mov   rax, QWORD[endtime.sec]
      sub   rax, QWORD[starttime.sec]
      call  ConvertToDecimal            ; RSI has pointer to buffer, RDX length
      call  Write
      mov   rsi, sPeriod                ; write decimal point
      mov   rdx, 1
      call Write
      pop   rax                         ; restore nanosec difference on stack
      call  ConvertToDecimal
      cmp   rdx, 9
      je    .noleadingzero
      mov   rcx, 9
      sub   rcx, rdx
.leadingzero:      
      dec   rsi
      inc   rdx
      mov   BYTE[rsi],"0"
      loop  .leadingzero
.noleadingzero:
      call  Write     
      mov   rsi, sSec
      mov   rdx, 1
      call  Write
.done:      
      mov   rsi, sEOL
      mov   rdx, 1
      call  Write
      xor   rdi, rdi
      mov   rax, SYS_EXIT
      syscall
;***********************************************************************************************
; Write
; Write a string pointed by RSI with length RDX to STDOUT
; All used registers are restored.
;***********************************************************************************************
Write:
      push  rax
      push  rcx
      push  rdi
      mov   rdi, STDOUT
      mov   rax, SYS_WRITE
      syscall
      pop   rdi
      pop   rcx
      pop   rax
      ret
;***********************************************************************************************
; WriteChar
; Write a character pointed by RSI to STDOUT
; all used registers are restored.
;***********************************************************************************************
WriteChar:
      push  rdx
      mov   rdx, 1
      call  Write
      pop   rdx
      ret
;***********************************************************************************************
; Start of MT19937 subroutines
;***********************************************************************************************
; Initialize
; ----------
; The routine accepts a seed in RDI, if this value is zero then a seed will be generated by the
; program.
; use:      mov     rdi, [seed or 0]
;           call    InitGenerator
; return:   none.
;***********************************************************************************************
Initialize:
    push    rax
    push    rbx
    push    rcx
    push    rdx
    push    rdi
    push    rsi
    mov     WORD[index], 0                        ; initialize the index with 0
    cmp     rax, 0
    jne     .start
    call    GenerateSeed                          ; seed in RAX
  .start:  
    mov     rdi, MT                               ; store pointer to MT in RDI
    mov     ecx, 1                                ; start with MT[1], RCX is i
    stosd                                       ; save seed in MT[0]
    mov     rsi, rdi                              ; RDI points to MT[i]
    sub     rsi, 4                                ; RSI points to MT[i-1]
  .nexti:    
    push    rax                                   ; MT[i-1] on stack
    ; ** right shift by 30 bits(MT[i-1])
    shr     eax, 30                     
    mov     ebx, eax
    pop     rax                                   ; get M[i-1] back
    ; **  MT[i-1] xor (right shift by 30 bits(MT[i-1])
    xor     eax, ebx                    
    mov     ebx, 0x6c078965                       ; 1812433253
    xor     rdx, rdx                              ; prepare multiplication
    ; **  1812433253 * (MT[i-1] xor (right shift by 30 bits(MT[i-1]))
    imul    ebx
    ; **  1812433253 * (MT[i-1] xor (right shift by 30 bits(MT[i-1]))) + i
    add     eax, ecx
    ; **  last 32 bits of(1812433253 * (MT[i-1] xor (right shift by 30 bits(MT[i-1]))) + i)
    stosd                                       ; save M[n]  RDI
    inc     ecx                                   ; i++
    cmp     ecx, 623                              ; repeat until i > 623
    jle     .nexti
    call    GenerateNumbers
    pop    rsi
    pop    rdi
    pop    rdx
    pop    rcx
    pop    rbx
    pop    rax
    ret
;***********************************************************************************************
; Generate
; --------
; The routine generates the random numbers. The Initialize subroutine calls this subroutine
; after the buffer is initialized with numbers based on the seed.
;***********************************************************************************************              
GenerateNumbers:
    push    rax
    push    rbx
    push    rcx
    push    rdx
    push    rsi   
    xor     rcx, rcx
  .repeat:
    mov     rsi, MT
    mov     ebx, ecx
    shl     ebx, 2                              ; offset in MT array
    ; ** y = (MT[i] and 0x80000000) + (MT[(i+1) mod 624] and 0x7FFFFFFF)
    ; **** calculate MT[i] and 0x80000000
    mov     eax, DWORD[rsi+rbx]
    and     eax, 0x80000000
    push    rax                                 ; MT[i] and 0x80000000
    ; calculate MT[(i+1) mod 624] and 0x7FFFFFFF
    mov     eax, ecx                            ; 
    inc     eax                                 ; i+1
    mov     ebx, 624
    xor     edx, edx
    idiv    ebx
    mov     ebx, edx                            ; (i+1) mod 624
    shl     ebx, 2
    mov     eax, DWORD[rsi+rbx]
    and     eax, 0x7FFFFFFF                     ; MT[(i+1) mod 624] and 0x7FFFFFFF
    pop     rbx                                 ; MT[i] and 0x80000000
    add     eax, ebx                            ; y
    push    rax                                 ; y on stack
    ; **** calculate MT[i] = MT[(i+397) mod 624] xor (right shift y by 1)
    shr     eax, 1                              ; y SHR 1
    push    rax                                 ; y SHR 1 on stack
    mov     eax, ecx                            ; i
    mov     ebx, 397                   
    add     eax, ebx                            ; i+397
    mov     ebx, 624
    xor     rdx, rdx
    idiv    ebx                                 ; (i+397) mod 624
    mov     ebx, edx                            ; in RBX
    shl     ebx, 2
    mov     eax, DWORD[rsi+rbx]                 ; MT[(i+397) mod 624]
    pop     rbx                                 ; y SHR 1 of stack
    xor     eax, ebx                            ; MT[(i+397) mod 624] xor (y SHR 1)
    mov     ebx, ecx
    shl     ebx, 2
    and     eax, 0x7FFFFFFF                     ; remove highest 33 bits
    mov     DWORD[rsi+rbx], eax                 ; save MT[i]
    pop     rax                                 ; y
    rcr     eax, 1                              ; CF = y mod 2
    jnc     .done
    mov     eax, DWORD[rsi+rbx]                 ; MT[i]
    xor     eax, 2567483615                     ; MT[i] xor 2567483615
    and     eax, 0x7FFFFFFF                     ; remove highest 33 bits
    mov     DWORD[rsi+rbx], eax
  .done:    
    inc     rcx
    cmp     ecx, 624
    jne     .repeat
    pop    rsi
    pop    rdx
    pop    rcx
    pop    rbx
    pop    rax
    ret
;***********************************************************************************************
; ExtractNumber
; -------------
; The routine extracts a number of the list and returns this in RAX, each 624 times it calls
; GenerateNumbers to create a new list of pseudo-random numbers.
;***********************************************************************************************
ExtractNumber:
    push    rbx
    push    rcx
    push    rdx
    push    rsi 
    xor     rax, rax
    mov     ax, WORD[index]
    push    rax                                 ; index on stack
    inc     eax                                 ; increment index
    mov     ebx, 624
    xor     rdx, rdx
    idiv    ebx
    mov     WORD[index], dx
    pop     rbx
    cmp     ebx, 0
    jne     .getnumber
    push    rbx                                 ; save i
    call    GenerateNumbers
    pop     rbx                                 ; restore i
.getnumber:
    mov     rsi, MT
    shl     rbx, 2
    ; ** int y := MT[index]
    mov     DWORD eax,[rsi+rbx]
    ; ** y := y xor (right shift by 11 bits(y))
    mov     edx, eax
    shr     edx, 11
    xor     eax, edx
    mov     edx, eax
    ; ** y := y xor (left shift by 7 bits(y) and (2636928640))
    shl     edx, 7
    and     edx, 2636928640
    xor     eax, edx                        
    mov     edx, eax
    ; ** y := y xor (left shift by 15 bits(y) and (4022730752))
    shl     edx, 15
    and     edx, 4022730752                     
    xor     eax, edx
    mov     edx, eax
    shr     edx, 18
    ; ** y := y xor (right shift by 18 bits(y))
    xor     eax, edx
    and     eax, 0x7FFFFFFF                     ; extracted number
    pop     rsi
    pop     rdx
    pop     rcx
    pop     rbx
    ret
;***********************************************************************************************
; GenerateSeed
; ------------
; This is just a helper subroutine, in case you don't want to provide seeds by your own.
; The routine creates a 32 bits unsigned integer based on ClockGetTime.
; Also remember, even if RCX isn't used it is changed by the syscalls.  The stackpointer will
; be used as pointer to the TIMESPEC.
; CALLING : call CreateSeed
; IN      : none
; OUT     : seed in RAX
; Stack layout after initializing stackframe:
; RSP +  0      value of seconds
; RSP +  8      value of nanoseconds
;***********************************************************************************************
GenerateSeed:
    push    rbp
    mov     rbp, rsp
    sub     rsp, 2*8                ; reserve 2 QWORDS
    mov     rsi, rsp
    xor     rdi, rdi
    mov     rax, SYS_CLOCK_GETTIME
    syscall
    mov     rax, [rsp]              ; get seconds
    mov     rbx, [rsp+8]            ; get nanosecs
    imul    rbx
    shl     rax, 32
    shr     rax, 32
    mov     rsp, rbp
    pop     rbp
    ret
;***********************************************************************************************
; COMMON SUBROUTINES
;***********************************************************************************************
; ConvertTodecimal
; ----------------
; Convert the register value in RAX to decimal ASCII and stores it in a buffer.  A pointer to
; the start of the decimal in the buffer is returned in RSI, the length in RDX.
; The purpose is to use this subroutine before writing the contents of RAX in decimal on STDOUT.
; A buffer "decimalbuffer" must be defined in the .bss or the .data section with a length
; suitable to store a decimal ASCII number in.         
;***********************************************************************************************
ConvertToDecimal:
    pushfq
    mov     rdi, decimalbuffer      ; point to decimal buffer, decimal will be there
    add     rdi, 19                 ; point to end of buffer
    mov     rbx, 10                 ; base 10
    std
.repeat:    
    xor     rdx, rdx
    idiv    rbx                     ; RAX = quotient, RDX = remainder
    xchg    rax, rdx                ; change quotient and remainder
    or      al, 0x30                ; make ASCII
    stosb
    xchg    rax, rdx                ; restore quotient
    cmp     rax, 0                  ; calculations left?
    jne     .repeat
    inc     rdi                     ; adjust pointer
    mov     rsi, rdi
    mov     rdx, decimalbuffer.end
    sub     rdx, rdi                ; length of the decimal
    popfq
    ret
;***********************************************************************************************
; GetTime
; -------
; This subroutine returns the time in RDX:RAX
; IN  : none
; OUT : RDX = seconds
;       RAX = nanoseconds
;***********************************************************************************************
GetTime:
    push    rsi
    push    rdi
    push    rcx
    push    rbp
    mov     rbp, rsp
    sub     rsp, 16                 ; reserve 2 QWORDS
    mov     rsi, rsp
    xor     rdi, rdi                ; CLOCK_REALTIME
    mov     rax, SYS_CLOCK_GETTIME
    syscall
    mov     rdx, [rsp]              ; seconds
    mov     rax, [rsp+8]            ; nanoseconds
    mov     rsp, rbp
    pop     rbp
    pop     rcx
    pop     rdi
    pop     rsi
    ret
;***********************************************************************************************